DengAI: Predicting Disease Spread — exploratory analysis and two-stage modelling of weekly dengue cases, keyed by city and date indicators.
# import library
# --- Notebook setup: imports, display options, plotting style ---
# NOTE(review): this file is a Jupyter-notebook export; lines such as
# "%matplotlib inline" and the bare "pwd" below are IPython magics and
# will not run as plain Python.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)  # show every column when printing frames
#pd.set_option('display.max_rows', None)
plt.style.use('ggplot')
%matplotlib inline
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor,Ridge,Lasso,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR, LinearSVR
from xgboost import XGBClassifier,XGBRegressor
from xgboost import plot_importance
from matplotlib import pyplot
import warnings
warnings.filterwarnings("ignore")  # silence library warning chatter in notebook output
pwd
# read data
# Training features, training labels and test features from the DrivenData
# "DengAI: Predicting Disease Spread" competition.
feature=pd.read_csv('DengAI_Predicting_Disease_Spread_-_Training_Data_Features.csv')
labels=pd.read_csv('DengAI_Predicting_Disease_Spread_-_Training_Data_Labels.csv')
testdata=pd.read_csv('DengAI_Predicting_Disease_Spread_-_Test_Data_Features.csv')
# join the labels
# (city, year, weekofyear) uniquely identifies a weekly observation, so a
# left merge attaches total_cases to every feature row.
df = pd.merge(feature, labels, how='left', on=['city', 'year','weekofyear'])
print(feature.shape)
print(labels.shape)
print(df.shape)
df.head()
col_name=df.columns
col_name
# distribution plot of the columns
# One kernel-density panel per measurement column; col_name[4:] skips the
# four identifier columns (city, year, weekofyear, week_start_date),
# leaving 21 columns for the 7x3 subplot grid.
# FIX: the loop body had lost its indentation in the notebook export.
plt.rc("font", size=13)
plt.rcParams["figure.figsize"] = [30, 25]
alpha = 0.6
for i, col in enumerate(col_name[4:]):
    plt.subplot(7, 3, i + 1)
    sns.kdeplot(df[str(col)], shade=True, color="b")
# check null values
df.isnull().sum()
# use forward fill on the NaNs - good since it's a timeseries
# (DataFrame.fillna(method='ffill') is deprecated in pandas >= 2.1;
# DataFrame.ffill() is the supported equivalent.)
df.ffill(inplace=True)
testdata.ffill(inplace=True)
#df.isna().sum()
# summary statistics
df.describe()
# check correlation between variables
# numeric_only=True: 'city' and 'week_start_date' are non-numeric and
# df.corr() would raise a TypeError on them under pandas >= 2.0.
corr = df.corr(numeric_only=True)
corr.style.background_gradient(cmap='coolwarm')
# 'RdBu_r' & 'BrBG', 'coolwarm' are other good diverging colormaps
# correlation between variables for both cities separately
# Separate data for San Juan and Iquitos cities
sj_df = df.loc[df.city=='sj']
iq_df = df.loc[df.city=='iq']
plt.rcParams["figure.figsize"] = [8,8]
sns.heatmap(sj_df.corr(numeric_only=True), cmap="YlGnBu")
plt.title('San Juan Variable Correlations')
plt.show()
#plt.rcParams["figure.figsize"] = [8,8]
sns.heatmap(iq_df.corr(numeric_only=True), cmap="YlGnBu")
plt.title('Iquitos Variable Correlations')
plt.show()
#distribution of total cases in two cities
# FIX: seaborn.distplot is deprecated and removed in seaborn >= 0.14;
# histplot (kde off by default) is the direct replacement for
# distplot(..., kde=False).
plt.rcParams["figure.figsize"] = [8, 6]
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
# bin width groups the weekly case counts into 12 buckets per city
sns.histplot(sj_df['total_cases'], bins=12, label='sj', color='b', ax=ax1)
ax1.set_title('San Juan', fontsize=18)
ax1.set(ylabel='frequency')
sns.histplot(iq_df['total_cases'], bins=12, label='iq', color='r', ax=ax2)
ax2.set_title('Iquitos', fontsize=18)
ax2.set(ylabel='frequency')
!pwd
# dengues case reported over time
# Interactive time-series of weekly reported cases per city (plotly).
import plotly.graph_objects as go
import datetime
fig = go.Figure([go.Scatter(x=sj_df['week_start_date'], y=sj_df['total_cases'],line_color='deepskyblue')])
fig.update_layout(title="Historical Dengue cases in San Juan ", xaxis_title="Year", yaxis_title="Case reported")
fig.show()
# NOTE(review): write_image needs the kaleido package and an existing
# "images/" directory — confirm both are available before running.
fig.write_image("images/Historical_case_SanJuan.png")
fig = go.Figure([go.Scatter(x=iq_df['week_start_date'], y=iq_df['total_cases'],line_color='green')])
fig.update_layout(title="Historical Dengue cases in Iqitous ", xaxis_title="Year", yaxis_title="Case reported")
fig.show()
fig.write_image("images/Historical_case_Iqitous.png")
# total cases per year, one bar chart per city
plt.rcParams["figure.figsize"] = [10, 6]

# San Juan: sum the weekly counts within each year
yearly_sj = sj_df.groupby(['year']).total_cases.sum().reset_index()
ax = sns.barplot(x='year', y='total_cases', data=yearly_sj)
ax.set(xlabel='year', ylabel='Total cases per year in San Juan')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()

## Iquitos: same aggregation and plot for the second city
yearly_iq = iq_df.groupby(['year']).total_cases.sum().reset_index()
ax = sns.barplot(x='year', y='total_cases', data=yearly_iq)
ax.set(xlabel='year', ylabel='Total cases per year in Iqitos')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()
A time series (TS) is said to be stationary if its statistical properties, such as mean and variance, remain constant over time. TS models work on the assumption that the series is stationary.
Reference: https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
# function to check stationarity using visual inspection and the
# augmented Dickey-Fuller test
# FIX: the function body had lost its indentation in the notebook export;
# also corrected the "Determing"/"statinarity" typos in comments.
from statsmodels.tsa.stattools import adfuller

def test_stationarity(timeseries):
    """Plot rolling mean/std of *timeseries* and print ADF test results.

    A series is treated as stationary when the rolling statistics stay
    roughly constant over time and the Dickey-Fuller p-value is small.
    """
    # Determine rolling statistics over a 3-observation window
    rolmean = pd.Series(timeseries).rolling(window=3).mean()
    rolstd = pd.Series(timeseries).rolling(window=3).std()
    # Plot rolling statistics:
    orig = plt.plot(timeseries, color='blue', label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    # Perform Dickey-Fuller test:
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4],
                         index=['Test Statistic', 'p-value', '#Lags Used',
                                'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
# Build a small date-indexed frame with the target and month, then run the
# stationarity check per city.
# FIX: .copy() so the column assignments below write into an independent
# frame instead of a view of df (avoids SettingWithCopyWarning).
tmp = df[['week_start_date', 'city', 'total_cases']].copy()
tmp['week_start_date'] = pd.to_datetime(tmp['week_start_date'])
tmp['month'] = tmp['week_start_date'].dt.month  # created month column
tmp.index = tmp['week_start_date']  ### reset axis
del tmp['week_start_date']
sj = tmp[tmp['city'] == 'sj']
iq = tmp[tmp['city'] == 'iq']
plt.figure(figsize=(10, 7))
test_stationarity(sj['total_cases'])
test_stationarity(iq['total_cases'])
# decomposition of the time series using moving average
# FIX: statsmodels renamed seasonal_decompose's `freq` argument to
# `period` (the old name was removed in statsmodels >= 0.13);
# 30 observations per seasonal cycle.
import statsmodels.api as sm
plt.figure(figsize=(10, 5))
res = sm.tsa.seasonal_decompose(sj.total_cases, period=30)
plt.show(block=False)
resplot = res.plot()
# same decomposition for Iquitos
plt.figure(figsize=(10, 5))
res = sm.tsa.seasonal_decompose(iq.total_cases, period=30)
resplot = res.plot()
Dengue virus spreads through the bite of mosquitoes carrying the virus. #### Possible reasons for spread:
An increase in the number of mosquitoes in a city
Mosquitoes may be more active in certain weather conditions
The number of dengue cases reported during a particular week may also depend on the previous week's infections, since symptoms usually appear a few days after the bite of an infected mosquito — a person typically reports illness 2-3 days after being bitten. To account for this delay, case counts should also be tested against the previous week's information (weather data, etc.)
# removing highly correlated columns
# (the non-numeric identifier columns 'city' and 'week_start_date' are
# left out here and re-attached after the filter)
colms = ['year', 'weekofyear', 'ndvi_ne', 'ndvi_nw',
         'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
         'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
         'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
         'reanalysis_precip_amt_kg_per_m2',
         'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
         'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
         'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
         'station_min_temp_c', 'station_precip_mm', 'total_cases']
data = df[colms]
# FIX: compute the correlation matrix on `data` itself instead of reusing
# the earlier global `corr` built from the full df — the old code only
# worked because the two frames happened to share the same column order.
# Also restored the loop indentation lost in the notebook export.
corr_mat = data.corr()
# greedy filter: drop the later column of every pair correlated >= 0.80
columns = np.full((corr_mat.shape[0],), True, dtype=bool)
for i in range(corr_mat.shape[0]):
    for j in range(i + 1, corr_mat.shape[0]):
        if corr_mat.iloc[i, j] >= 0.80:
            if columns[j]:
                columns[j] = False
selected_columns = data.columns[columns]
selected_col1 = ['city', 'week_start_date'] + list(selected_columns)
print(selected_col1)
# filters these columns for training and test data sets
# filter columns based on correlation
# FIX: .copy() so the assignments below modify an independent frame, not a
# view of df (avoids SettingWithCopyWarning).
df1 = df[selected_col1].copy()
#convert week_start_date into a real datetime
df1['week_start_date'] = pd.to_datetime(df1['week_start_date'])
#extract month to new column
df1['month'] = df1.week_start_date.dt.month
df1.head()
# add historical average of dengue case count for each (city, weekofyear)
# pair — the average over all training years is a strong seasonal baseline
# (joined back as total_cases_avg).
df1 = df1.join(df1.groupby(['city', 'weekofyear'])['total_cases'].mean(),
               on=['city', 'weekofyear'], rsuffix='_avg')
df1.head()
# dead experiments kept for reference: the average was added because raw
# predictions were too low without knowledge of historical case counts
#df2['total_cases']=df2['total_cases']+df2['total_cases_avg']
#df2['random']=np.random.uniform(low=0.8, high=1.5, size=len(test))
#df2['total_cases']=df2['total_cases_avg']*test['random']
#df2.to_csv("test_preds_added.csv")
# separate San Juan and Iquitos from the training data
# FIX: .copy() so the per-city feature engineering below writes into
# independent frames (avoids SettingWithCopyWarning); restored the loop
# indentation lost in the notebook export; fillna(method='bfill') is
# deprecated — bfill() is the supported equivalent.
df1_sj = df1[df1['city'] == 'sj'].copy()
df1_iq = df1[df1['city'] == 'iq'].copy()
# final test data
#df2_sj=df2[df2['city']=='sj']
#df2_iq=df2[df2['city']=='iq']
# rolling features on selected independent variables.
# NOTE(review): the window is 3 and includes the current row, i.e. the
# current week plus the prior two — the old comments said "prior 2 or 4
# weeks"; confirm which was intended.
rolling_cols_sum = [
    'precipitation_amt_mm',
    'station_precip_mm'
]
rolling_cols_avg = [
    'ndvi_ne',
    'ndvi_se',
    'reanalysis_air_temp_k',
    'reanalysis_dew_point_temp_k',
    'reanalysis_max_air_temp_k',
    'reanalysis_min_air_temp_k',
    'reanalysis_precip_amt_kg_per_m2',
    'reanalysis_relative_humidity_percent',
    'station_avg_temp_c',
    'station_max_temp_c',
    'station_min_temp_c'
]
# San Juan
for col in rolling_cols_sum:
    df1_sj['rolling_sum_' + col] = df1_sj[[str(col)]].rolling(window=3, center=False).sum()
for col in rolling_cols_avg:
    df1_sj['rolling_avg_' + col] = df1_sj[[str(col)]].rolling(window=3, center=False).mean()
# Iquitos
for col in rolling_cols_sum:
    df1_iq['rolling_sum_' + col] = df1_iq[[str(col)]].rolling(window=3, center=False).sum()
for col in rolling_cols_avg:
    df1_iq['rolling_avg_' + col] = df1_iq[[str(col)]].rolling(window=3, center=False).mean()
# back-fill the leading NaNs the rolling windows introduce
# (bfill uses the NEXT valid observation to fill the gap)
df1_sj.bfill(inplace=True)
df1_iq.bfill(inplace=True)
# splitting trained data for training and testing purpose
# usual machine learning train test split would not work here as time component should be preserved
# as it is time series , we select window of first 800 observation for training and rest are for testing
# NOTE(review): the code actually takes the first 80% of rows per city (in
# time order), not a fixed 800-row window — the comment above is stale.
train_size = int(df1_sj.shape[0] * 0.80)
df1_sj_train, df1_sj_test = df1_sj[0:train_size], df1_sj[train_size:df1_sj.shape[0]]
train_size = int(df1_iq.shape[0] * 0.80)
df1_iq_train, df1_iq_test = df1_iq[0:train_size], df1_iq[train_size:df1_iq.shape[0]]
df1_sj.head()
df1_sj_train.columns
# First-stage feature set: the measurements that survived the correlation
# filter, the seasonal baseline (total_cases_avg) and the rolling
# sums/averages engineered above.
features1=['ndvi_ne','ndvi_se',
'precipitation_amt_mm', 'reanalysis_air_temp_k',
'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
'reanalysis_min_air_temp_k', 'reanalysis_precip_amt_kg_per_m2',
'reanalysis_relative_humidity_percent', 'station_avg_temp_c',
'station_max_temp_c', 'station_min_temp_c', 'station_precip_mm',
'total_cases_avg','rolling_sum_precipitation_amt_mm', 'rolling_sum_station_precip_mm',
'rolling_avg_ndvi_ne','rolling_avg_ndvi_se',
'rolling_avg_reanalysis_air_temp_k',
'rolling_avg_reanalysis_dew_point_temp_k',
'rolling_avg_reanalysis_max_air_temp_k',
'rolling_avg_reanalysis_min_air_temp_k',
'rolling_avg_reanalysis_precip_amt_kg_per_m2',
'rolling_avg_reanalysis_relative_humidity_percent',
'rolling_avg_station_avg_temp_c', 'rolling_avg_station_max_temp_c',
'rolling_avg_station_min_temp_c']
# feature importance using randomforest/XGBM
# XGBoost's built-in importance plot first
X = df1_sj_train[features1]
Y = df1_sj_train['total_cases']
xgb = XGBRegressor()
xgb.fit(X, Y)
plot_importance(xgb)
pyplot.show()

# Random forest gini importances on the same training matrix
rf = RandomForestRegressor(n_estimators=20, random_state=0)
rf.fit(X, Y)
tmp = pd.DataFrame(
    {'importace_col': X.columns, 'gini_importance': rf.feature_importances_})
tmp.sort_values(by=['gini_importance'], ascending=False, inplace=True)
tmp.plot(kind='bar', y='gini_importance', x='importace_col', sort_columns=True)
#list(tmp.importace_col)
# model comparisons for San Juan and Iquitos (first-stage features)
# FIX: restored the loop indentation lost in the notebook export and
# factored the duplicated comparison code into a helper.
# scoring='neg_mean_absolute_error': higher (closer to zero) is better.
from sklearn.model_selection import cross_val_score

def _score_models(X, Y):
    """Return a tidy frame of per-fold negative-MAE scores for the
    candidate regressors fitted on (X, Y)."""
    models = [
        RandomForestRegressor(n_estimators=20, random_state=0),
        XGBRegressor(n_estimators=20),
        GradientBoostingRegressor(n_estimators=20),
        ElasticNet(),
        LinearSVR()
    ]
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, X, Y, scoring='neg_mean_absolute_error')
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((model_name, fold_idx, accuracy))
    return pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'neg_mean_absolute_error'])

# San Juan
X = df1_sj_train[features1]
Y = df1_sj_train['total_cases']
cv_df = _score_models(X, Y)
# box plot for model comparisons
ax = sns.boxplot(x='model_name', y='neg_mean_absolute_error', data=cv_df)
ax.set(xlabel='model_name', ylabel='neg_mean_absolute_error')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()

# Iquitos
X = df1_iq_train[features1]
Y = df1_iq_train['total_cases']
cv_df = _score_models(X, Y)
ax = sns.boxplot(x='model_name', y='neg_mean_absolute_error', data=cv_df)
ax.set(xlabel='model_name', ylabel='neg_mean_absolute_error')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()
# hyperparameter tuning for Elastic Net via 5-fold grid search over the
# regularisation strength (alpha) and the L1/L2 mix (l1_ratio)
from sklearn.model_selection import GridSearchCV
import time as time
import numpy as np

param = {"alpha": np.arange(0.05, 1, 0.1), "l1_ratio": np.arange(0, 1, 0.1)}

# San Juan
X = df1_sj_train[features1]
Y = df1_sj_train['total_cases']
#svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), cv=5,param_grid=param)
svr = GridSearchCV(ElasticNet(random_state=1), cv=5, param_grid=param)
t0 = time.time()
svr.fit(X, Y)
svr_fit = time.time() - t0
# FIX: the original message ("SVR complexity and bandwidth selected...")
# was copied from an SVR example; this grid search tunes an ElasticNet.
print("ElasticNet grid search finished and model fitted in %.3f s" % svr_fit)
model_svr_sj = svr.best_estimator_
model_svr_sj
# refit the best estimator on the full training window
model_svr_sj.fit(X, Y)

# Iquitos: same grid search on the second city's training data
X = df1_iq_train[features1]
Y = df1_iq_train['total_cases']
svr = GridSearchCV(ElasticNet(random_state=1), cv=5, param_grid=param)
t0 = time.time()
svr.fit(X, Y)
svr_fit = time.time() - t0
print("ElasticNet grid search finished and model fitted in %.3f s" % svr_fit)
model_svr_iq = svr.best_estimator_
model_svr_iq
model_svr_iq.fit(X, Y)
# first-stage predictions on the training window
preds_sj_svr = model_svr_sj.predict(df1_sj_train[features1]).astype(int)
preds_iq_svr = model_svr_iq.predict(df1_iq_train[features1]).astype(int)
df1_sj_train['predict'] = preds_sj_svr
df1_iq_train['predict'] = preds_iq_svr
# index by date so the plots get a time axis
df1_sj_train.index = df1_sj_train['week_start_date']
df1_iq_train.index = df1_iq_train['week_start_date']
# plot predictions against actuals, one panel per city
figs, axes = plt.subplots(nrows=2, ncols=1, figsize=(11, 9))
for axis, frame in zip(axes, (df1_sj_train, df1_iq_train)):
    frame.total_cases.plot(ax=axis, label="Actual")
    frame.predict.plot(ax=axis, label="Predictions")
plt.suptitle("Dengue Predicted Cases vs. Actual Cases")
plt.legend()

# the same comparison on the held-out tail of the training data
preds_sj_svr = model_svr_sj.predict(df1_sj_test[features1]).astype(int)
preds_iq_svr = model_svr_iq.predict(df1_iq_test[features1]).astype(int)
df1_sj_test['predict'] = preds_sj_svr
df1_iq_test['predict'] = preds_iq_svr
df1_sj_test.index = df1_sj_test['week_start_date']
df1_iq_test.index = df1_iq_test['week_start_date']
figs, axes = plt.subplots(nrows=2, ncols=1, figsize=(11, 9))
for axis, frame in zip(axes, (df1_sj_test, df1_iq_test)):
    frame.total_cases.plot(ax=axis, label="Actual")
    frame.predict.plot(ax=axis, label="Predictions")
plt.suptitle("Dengue Predicted Cases vs. Actual Cases")
plt.legend()
# first stage data prep for the competition test set
tmp = selected_col1.copy()
# FIX: remove 'total_cases' by name rather than `del tmp[-1]`, which only
# worked while the target happened to be the last entry of the list.
tmp.remove('total_cases')
# filter columns based on correlation (.copy(): independent frame, avoids
# SettingWithCopyWarning on the assignments below)
test = testdata[tmp].copy()
#convert week_start_date to datetime
test['week_start_date'] = pd.to_datetime(test['week_start_date'])
#extract month to new column
test['month'] = test.week_start_date.dt.month
# attach the historical weekly average computed on the TRAINING data
test = test.join(df1.groupby(['city', 'weekofyear'])['total_cases'].mean(),
                 on=['city', 'weekofyear'], rsuffix='_avg')
# the join adds the column as 'total_cases' (no name collision in test) —
# rename to match the training feature name
test.rename(columns={'total_cases': 'total_cases_avg'}, inplace=True)
test.head()
# per-city frames for the final test data
test_sj = test[test['city'] == 'sj'].copy()
test_iq = test[test['city'] == 'iq'].copy()
# rolling sum / rolling average features, mirroring the training prep
# (window=3: current week plus the prior two)
rolling_cols_sum = [
    'precipitation_amt_mm',
    'station_precip_mm'
]
rolling_cols_avg = [
    'ndvi_ne',
    'ndvi_se',
    'reanalysis_air_temp_k',
    'reanalysis_dew_point_temp_k',
    'reanalysis_max_air_temp_k',
    'reanalysis_min_air_temp_k',
    'reanalysis_precip_amt_kg_per_m2',
    'reanalysis_relative_humidity_percent',
    'station_avg_temp_c',
    'station_max_temp_c',
    'station_min_temp_c'
]
# San Juan
for col in rolling_cols_sum:
    test_sj['rolling_sum_' + col] = test_sj[[str(col)]].rolling(window=3, center=False).sum()
for col in rolling_cols_avg:
    test_sj['rolling_avg_' + col] = test_sj[[str(col)]].rolling(window=3, center=False).mean()
# Iquitos
for col in rolling_cols_sum:
    test_iq['rolling_sum_' + col] = test_iq[[str(col)]].rolling(window=3, center=False).sum()
for col in rolling_cols_avg:
    test_iq['rolling_avg_' + col] = test_iq[[str(col)]].rolling(window=3, center=False).mean()
# back-fill the leading NaNs the rolling windows create
# (fillna(method='bfill') is deprecated; bfill() is equivalent)
test_sj.bfill(inplace=True)
test_iq.bfill(inplace=True)
# test_sj.columns
# NOTE(review): this re-declares features1 with the same contents as the
# earlier definition — redundant, kept for notebook-execution convenience.
features1=['ndvi_ne','ndvi_se',
'precipitation_amt_mm', 'reanalysis_air_temp_k',
'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
'reanalysis_min_air_temp_k', 'reanalysis_precip_amt_kg_per_m2',
'reanalysis_relative_humidity_percent', 'station_avg_temp_c',
'station_max_temp_c', 'station_min_temp_c', 'station_precip_mm',
'total_cases_avg','rolling_sum_precipitation_amt_mm', 'rolling_sum_station_precip_mm',
'rolling_avg_ndvi_ne','rolling_avg_ndvi_se',
'rolling_avg_reanalysis_air_temp_k',
'rolling_avg_reanalysis_dew_point_temp_k',
'rolling_avg_reanalysis_max_air_temp_k',
'rolling_avg_reanalysis_min_air_temp_k',
'rolling_avg_reanalysis_precip_amt_kg_per_m2',
'rolling_avg_reanalysis_relative_humidity_percent',
'rolling_avg_station_avg_temp_c', 'rolling_avg_station_max_temp_c',
'rolling_avg_station_min_temp_c']
# feature importance using randomforest/XGBM
#test_sj[features1]
#test_iq[features1]
# first stage test prediction
# NOTE(review): predictions are cast to float here but the final
# second-stage submission uses int — confirm the competition format
# expects integers.
preds_sj_svr = model_svr_sj.predict(test_sj[features1]).astype(float)
preds_iq_svr = model_svr_iq.predict(test_iq[features1]).astype(float)
# the submission template is indexed by (city, year, weekofyear); the
# sj-then-iq concatenation below assumes the template rows are in that
# same order — TODO confirm against the file.
test_submission = pd.read_csv("DengAI_Predicting_Disease_Spread_-_Submission_Format.csv",index_col=[0, 1, 2])
test_submission.total_cases = np.concatenate([preds_sj_svr, preds_iq_svr])
test_submission.to_csv("first_stage_result_submission.csv")
#add a total cases column to the test dataframe
test1 = pd.concat([test_sj, test_iq])
test1['pred_total_cases'] = np.concatenate([preds_sj_svr, preds_iq_svr])
test1.head()
# second-stage features: lagged case counts (ARIMA-style lag columns) and a
# 2-week moving average of the target (train) / of the first-stage
# predictions (test)
#train
df1_sj['cases_prev_wk1'] = df1_sj['total_cases'].shift(1)
df1_iq['cases_prev_wk1'] = df1_iq['total_cases'].shift(1)
df1_sj['cases_prev_wk2'] = df1_sj['total_cases'].shift(2)
df1_iq['cases_prev_wk2'] = df1_iq['total_cases'].shift(2)
# adding moving average of the target
df1_sj['total_cases_mw2'] = df1_sj['total_cases'].rolling(window=2, center=False).mean()
df1_iq['total_cases_mw2'] = df1_iq['total_cases'].rolling(window=2, center=False).mean()
#test: same features derived from the first-stage predictions
# FIX: .copy() — these are slices of test1 and are mutated below
# (avoids SettingWithCopyWarning / silent writes into a view).
test1_sj = test1[test1['city'] == 'sj'].copy()
test1_iq = test1[test1['city'] == 'iq'].copy()
test1_sj['cases_prev_wk1'] = test1_sj['pred_total_cases'].shift(1)
test1_iq['cases_prev_wk1'] = test1_iq['pred_total_cases'].shift(1)
test1_sj['cases_prev_wk2'] = test1_sj['pred_total_cases'].shift(2)
test1_iq['cases_prev_wk2'] = test1_iq['pred_total_cases'].shift(2)
# adding moving average of prediction
test1_sj['total_cases_mw2'] = test1_sj['pred_total_cases'].rolling(window=2, center=False).mean()
test1_iq['total_cases_mw2'] = test1_iq['pred_total_cases'].rolling(window=2, center=False).mean()
# back-fill the NaNs the shifts / rolling windows introduce
# (bfill uses the NEXT valid observation; fillna(method=...) is deprecated)
# NOTE(review): back-filling lagged targets copies FUTURE values into the
# first rows — a small amount of leakage at each series start; confirm
# this is acceptable.
df1_sj.bfill(inplace=True)
df1_iq.bfill(inplace=True)
test1_sj.bfill(inplace=True)
test1_iq.bfill(inplace=True)
# selecting features for model
# Second-stage feature set: the first-stage features plus month and the
# new lag / moving-average columns built from the target (train) or the
# first-stage predictions (test).
features2=['ndvi_ne', 'ndvi_se',
'precipitation_amt_mm', 'reanalysis_air_temp_k',
'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
'reanalysis_min_air_temp_k', 'reanalysis_precip_amt_kg_per_m2',
'reanalysis_relative_humidity_percent', 'station_avg_temp_c',
'station_max_temp_c', 'station_min_temp_c', 'station_precip_mm',
'month', 'total_cases_avg',
'rolling_sum_precipitation_amt_mm', 'rolling_sum_station_precip_mm',
'rolling_avg_ndvi_ne', 'rolling_avg_ndvi_se',
'rolling_avg_reanalysis_air_temp_k',
'rolling_avg_reanalysis_dew_point_temp_k',
'rolling_avg_reanalysis_max_air_temp_k',
'rolling_avg_reanalysis_min_air_temp_k',
'rolling_avg_reanalysis_precip_amt_kg_per_m2',
'rolling_avg_reanalysis_relative_humidity_percent',
'rolling_avg_station_avg_temp_c', 'rolling_avg_station_max_temp_c',
'rolling_avg_station_min_temp_c', 'cases_prev_wk1','cases_prev_wk2','total_cases_mw2']
# Second-stage chronological split: first 80% of rows per city for
# training, the remaining 20% for validation.
train_size = int(df1_sj.shape[0] * 0.80)
df1_sj_train, df1_sj_test = df1_sj[0:train_size], df1_sj[train_size:df1_sj.shape[0]]
train_size = int(df1_iq.shape[0] * 0.80)
df1_iq_train, df1_iq_test = df1_iq[0:train_size], df1_iq[train_size:df1_iq.shape[0]]
# Train matrices (San Juan)
X_sj_train = df1_sj_train[features2]
Y_sj_train = df1_sj_train['total_cases']
# FIX: the original bound X_iq_test/Y_iq_test to San Juan's test split — a
# copy-paste slip. Provide correctly named matrices for both cities.
X_sj_test = df1_sj_test[features2]
Y_sj_test = df1_sj_test['total_cases']
X_iq_test = df1_iq_test[features2]
Y_iq_test = df1_iq_test['total_cases']
# data preparation from final (unseen) competition test data
X_sj_t = test1_sj[features2]
X_iq_t = test1_iq[features2]
# Model comparison on the second-stage feature set (lag + rolling target)
# FIX: restored the loop indentation lost in the notebook export and
# factored the duplicated per-city comparison into a helper.
def _cv_scores(X, Y):
    """Per-fold negative-MAE for the candidate regressors on (X, Y)."""
    models = [
        RandomForestRegressor(n_estimators=20, random_state=0),
        XGBRegressor(n_estimators=20),
        GradientBoostingRegressor(n_estimators=20),
        ElasticNet(),
        LinearSVR()
    ]
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, X, Y, scoring='neg_mean_absolute_error')
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((model_name, fold_idx, accuracy))
    return pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'neg_mean_absolute_error'])

# San Juan
X = df1_sj_train[features2]
Y = df1_sj_train['total_cases']
cv_df = _cv_scores(X, Y)
# box plot for model comparison
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax = sns.boxplot(x='model_name', y='neg_mean_absolute_error', data=cv_df)
ax.set(xlabel='model_name (San Juan)', ylabel='neg_mean_absolute_error')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()
figure = ax.get_figure()
# NOTE(review): assumes an "images/" directory exists next to the notebook
figure.savefig('images/model_comparisons_SanJuan.png', bbox_inches='tight')

#Iquitos
X = df1_iq_train[features2]
Y = df1_iq_train['total_cases']
cv_df = _cv_scores(X, Y)
# box plot for model comparisons
ax = sns.boxplot(x='model_name', y='neg_mean_absolute_error', data=cv_df)
ax.set(xlabel='model_name(Iqitos)', ylabel='neg_mean_absolute_error')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()
figure = ax.get_figure()
figure.savefig('images/model_comparisons_Iqitos.png', bbox_inches='tight')
# second-stage grid search for ElasticNet (same grid as the first stage)
# San Juan
X = df1_sj_train[features2]
Y = df1_sj_train['total_cases']
param = {"alpha": np.arange(0.05, 1, 0.1), "l1_ratio": np.arange(0, 1, 0.1)}
#svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), cv=5,param_grid=param)
svr = GridSearchCV(ElasticNet(random_state=1), cv=5, param_grid=param)
t0 = time.time()
svr.fit(X, Y)
svr_fit = time.time() - t0
# FIX: the original message claimed "SVR complexity and bandwidth" but
# this grid search tunes an ElasticNet.
print("ElasticNet grid search finished and model fitted in %.3f s" % svr_fit)
model_svr_sj = svr.best_estimator_
model_svr_sj
# refit the best estimator on the full training window
model_svr_sj.fit(X, Y)

# Iquitos
X = df1_iq_train[features2]
Y = df1_iq_train['total_cases']
svr = GridSearchCV(ElasticNet(random_state=1), cv=5, param_grid=param)
t0 = time.time()
svr.fit(X, Y)
svr_fit = time.time() - t0
print("ElasticNet grid search finished and model fitted in %.3f s" % svr_fit)
model_svr_iq = svr.best_estimator_
model_svr_iq
model_svr_iq.fit(X, Y)
# second-stage fit diagnostics: predictions vs actuals on the training window
preds_sj_svr = model_svr_sj.predict(df1_sj_train[features2]).astype(int)
preds_iq_svr = model_svr_iq.predict(df1_iq_train[features2]).astype(int)
df1_sj_train['predict'] = preds_sj_svr
df1_iq_train['predict'] = preds_iq_svr
# date index gives the plots a time axis
df1_sj_train.index = df1_sj_train['week_start_date']
df1_iq_train.index = df1_iq_train['week_start_date']
figs, axes = plt.subplots(nrows=2, ncols=1, figsize=(11, 9))
for axis, frame in zip(axes, (df1_sj_train, df1_iq_train)):
    frame.total_cases.plot(ax=axis, label="Actual")
    frame.predict.plot(ax=axis, label="Predictions")
plt.suptitle("Dengue Predicted Cases vs. Actual Cases")
plt.legend()

## Prediction on the held-out test window
preds_sj_svr = model_svr_sj.predict(df1_sj_test[features2]).astype(int)
preds_iq_svr = model_svr_iq.predict(df1_iq_test[features2]).astype(int)
df1_sj_test['predict'] = preds_sj_svr
df1_iq_test['predict'] = preds_iq_svr
df1_sj_test.index = df1_sj_test['week_start_date']
df1_iq_test.index = df1_iq_test['week_start_date']
figs, axes = plt.subplots(nrows=2, ncols=1, figsize=(11, 9))
for axis, frame in zip(axes, (df1_sj_test, df1_iq_test)):
    frame.total_cases.plot(ax=axis, label="Actual")
    frame.predict.plot(ax=axis, label="Predictions")
plt.suptitle("Dengue Predicted Cases vs. Actual Cases on test sample")
plt.legend()
plt.savefig('images/actual_vs_prediction_for test_sample.png')
# final second-stage predictions on the unseen competition test set
X_sj_t = test1_sj[features2]
X_iq_t = test1_iq[features2]
# predict for each city with its tuned model
sj_predictions = model_svr_sj.predict(X_sj_t).astype(int)
iq_predictions = model_svr_iq.predict(X_iq_t).astype(int)
# load the DrivenData submission template and fill in the predictions
df_submission_final = pd.read_csv(
    "DengAI_Predicting_Disease_Spread_-_Submission_Format.csv",
    index_col=[0, 1, 2])
df_submission_final['total_cases'] = np.concatenate([sj_predictions, iq_predictions])
df_submission_final.to_csv("DengueAI_prediction_2stage.csv")